import os

import fitz

from zuele import Tokenizer

# Module-level Tokenizer instance; the script entry point calls its cut()
# method on the text extracted from each PDF.
tok = Tokenizer()



# Extract the text content of a PDF file.
def extract_text_from_pdf(pdf_path) -> str:
    """Return the concatenated text of every page in the document at *pdf_path*.

    Args:
        pdf_path: Path of the document; passed unchanged to ``fitz.open``.

    Returns:
        The ``page.get_text()`` output of each page, joined in page order
        with no separator. Empty string when the document has no pages.

    Raises:
        Nothing is caught here: any error raised by ``fitz.open`` or
        ``page.get_text`` propagates to the caller.
    """
    # Use the document as a context manager so it is closed even if a page
    # fails to extract; previously the handle was never released.
    with fitz.open(pdf_path) as doc:
        # Join once instead of growing a string page by page.
        return "".join(page.get_text() for page in doc)


# Main program: tokenize the text of every PDF found in the input directory.
if __name__ == "__main__":
    pdf_dir = "pdf"  # directory that holds the PDF files
    # os.listdir returns entries in arbitrary order; sort so the output order
    # is reproducible from run to run.
    for filename in sorted(os.listdir(pdf_dir)):
        pdf_path = os.path.join(pdf_dir, filename)
        # Skip subdirectories and stray non-PDF entries (e.g. .DS_Store)
        # instead of letting fitz.open fail on them and abort the whole run.
        if not (filename.lower().endswith(".pdf") and os.path.isfile(pdf_path)):
            continue
        text = extract_text_from_pdf(pdf_path)
        print(list(tok.cut(text)))
